%load_ext nb_black
import pandas as pd
import numpy as np
import io
import requests
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import plot
import plotly.io as pio
pio.templates.default = "plotly_white"
def get_ts_df(df_pd_raw, col_name):
df_pd_raw.columns = ["state", "country", "latitude", "longitude"] + [
c for c in df_pd_raw.columns[4:]
]
df_pd_ts_series = df_pd_raw.set_index(
["state", "country", "latitude", "longitude"]
).stack()
df_pd_ts_series = pd.DataFrame(df_pd_ts_series).reset_index(drop=False)
df_pd_ts_series.columns = [
"state",
"country",
"latitude",
"longitude",
"timestamp",
col_name,
]
df_pd_ts_series["timestamp"] = pd.to_datetime(df_pd_ts_series["timestamp"])
return df_pd_ts_series
def get_country_aggs(df_pd_raw, col_name):
return (
df_pd_raw.groupby(["country", "timestamp"])
.agg({"latitude": "mean", "longitude": "mean", col_name: "sum"})
.reset_index(drop=False)
)
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
url_deaths = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv"
url_recovered = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv"
s = requests.get(url).content
df_pd_raw_cases = pd.read_csv(io.StringIO(s.decode("utf-8")))
s = requests.get(url_deaths).content
df_pd_raw_deaths = pd.read_csv(io.StringIO(s.decode("utf-8")))
s = requests.get(url_recovered).content
df_pd_raw_recovered = pd.read_csv(io.StringIO(s.decode("utf-8")))
df_pd_cases = get_country_aggs(get_ts_df(df_pd_raw_cases, "numCases"), "numCases")
df_pd_deaths = get_country_aggs(get_ts_df(df_pd_raw_deaths, "numDeaths"), "numDeaths")
df_pd_recovered = get_country_aggs(
get_ts_df(df_pd_raw_recovered, "numRecovered"), "numRecovered"
)
df_pd_country_first = (
df_pd_cases[df_pd_cases["numCases"] > 0]
.groupby("country")
.agg({"timestamp": "min"})
.reset_index(drop=False)
)
df_pd_country_first.columns = ["country", "firstTimestamp"]
df_pd_country = pd.merge(
df_pd_cases,
df_pd_deaths[["country", "timestamp", "numDeaths"]],
on=["country", "timestamp"],
how="outer",
)
df_pd_country = pd.merge(
df_pd_country,
df_pd_recovered[["country", "timestamp", "numRecovered"]],
on=["country", "timestamp"],
how="outer",
)
df_pd_country = pd.merge(df_pd_country, df_pd_country_first, on="country", how="left")
df_pd_country["daysSince"] = (
df_pd_country["timestamp"] - df_pd_country["firstTimestamp"]
).dt.days
min_ts = df_pd_country["timestamp"].min().strftime("%Y-%m-%d")
col_days_from_beginning = f"daysFrom{min_ts}"
df_pd_country[col_days_from_beginning] = (
df_pd_country["timestamp"] - df_pd_country["timestamp"].min()
).dt.days
df_pd_stacked = pd.DataFrame(
df_pd_country.set_index(
["country", "timestamp", "latitude", "longitude", "firstTimestamp", "daysSince"]
)[["numCases", "numDeaths", "numRecovered"]].stack()
).reset_index(drop=False)
df_pd_stacked.columns = [
"country",
"timestamp",
"latitude",
"longitude",
"firstTimestamp",
"daysSince",
"type",
"cases",
]
px.line(
df_pd_stacked[df_pd_stacked["daysSince"] >= 0],
x="daysSince",
y="cases",
color="country",
facet_row="type",
hover_name="timestamp",
height=1000,
title="Increases from 1st day case was reported",
)
df_pd_country[df_pd_country["country"] == "US"]
top_countries = (
df_pd_country.groupby("country")
.agg({"numCases": "sum"})
.sort_values("numCases", ascending=False)
.head(50)
.index.tolist()
)
# top_countries.remove("China")
df_pd = (
df_pd_country[(df_pd_country["country"]).isin(top_countries)]
.sort_values("timestamp")
.copy()
)
df_pd = df_pd.fillna(1)
df_pd["deathRate"] = (df_pd["numDeaths"] / df_pd["numCases"]).fillna(0)
df_pd["ts_str"] = df_pd["timestamp"].astype(str)
df_pd["deathPer1M"] = (df_pd["numDeaths"] * 1e6 / df_pd["numCases"]).fillna(0) + 1
df_pd["numDeaths"] = df_pd["numDeaths"] + 1
df_pd["numCases"] = df_pd["numCases"] + 1
x = "numCases"
y = "numDeaths"
size = "deathPer1M"
min_ts = df_pd["timestamp"].min().strftime("%Y-%m-%d")
max_ts = df_pd["timestamp"].max().strftime("%Y-%m-%d")
fig = px.scatter(
df_pd[df_pd[col_days_from_beginning] > 0].sort_values("timestamp"),
x=x,
y=y,
animation_frame="ts_str",
animation_group="country",
size=size,
color="country",
hover_name="timestamp",
range_x=[1, 200e3],
range_y=[1, 1e4],
size_max=55,
color_discrete_sequence=px.colors.qualitative.Dark24,
text="country",
log_x=True,
log_y=True,
title=f"Covid-19: Top-50 countries; {min_ts} -- {max_ts} <br> Size of the bubble = Num deaths per million",
opacity=0.75,
)
fig.update_traces(textposition="top center", textfont=dict(size=10, color="gray"))
fig.update_layout(
xaxis=dict(title="Total Num Cases"), yaxis=dict(title="Total Num Deaths"),
)
plot(fig, filename=f"covid.html", auto_open=True)
df_pd_1day_lag = (
df_pd_country.sort_values(["country", "timestamp"])
.set_index(["timestamp", "country"])
.groupby(level="country")
.shift(1)
)[["numCases", "numDeaths", "numRecovered"]]
df_pd_1day_lag.columns = ["prevNumCases", "prevNumDeaths", "prevNumRecovered"]
df_pd_country = df_pd_country.set_index(["timestamp", "country"]).join(
df_pd_1day_lag, how="left"
)
df_pd_country["diffNumCases"] = (
df_pd_country["numCases"] - df_pd_country["prevNumCases"]
)
df_pd_country["rateIncreaseNumCases"] = (
df_pd_country["numCases"] / df_pd_country["prevNumCases"]
)
df_pd_country["diffNumDeaths"] = (
df_pd_country["numDeaths"] - df_pd_country["prevNumDeaths"]
)
df_pd_country["rateIncreaseNumDeaths"] = (
df_pd_country["numDeaths"] / df_pd_country["prevNumDeaths"]
)
df_pd_country["diffNumRecovered"] = (
df_pd_country["numRecovered"] - df_pd_country["prevNumRecovered"]
)
df_pd_country["rateIncreaseNumRecovered"] = (
df_pd_country["numRecovered"] / df_pd_country["prevNumRecovered"]
)
df_pd_country["deathRate"] = (
df_pd_country["numDeaths"] / df_pd_country["numCases"]
).fillna(0)
df_pd_country = df_pd_country.reset_index(drop=False)
px.line(
df_pd_country.reset_index(drop=False),
x="timestamp",
y="rateIncreaseNumCases",
color="country",
title="Rate of increase of reported num cases between consecutive days",
)
px.line(
df_pd_country.reset_index(drop=False),
x="timestamp",
y="deathRate",
color="country",
title="Death rate",
)
from scipy import optimize
from lmfit import Model
def get_exp_fit_for_country(df_pd_country, country, metric="numCases"):
df_pd_per_country = (
df_pd_country[
(df_pd_country["country"] == country) & (df_pd_country["daysSince"] > 0)
]
.sort_values("daysSince")
.copy()
)
exp_model = Model(lambda t, a, b, t0: a * np.exp(b * (t - t0)))
result = exp_model.fit(
df_pd_per_country[metric]
+ np.random.normal(0, 0.2, df_pd_per_country.shape[0]),
t=df_pd_per_country["daysSince"],
a=1.5,
b=2.4,
t0=1,
)
metric_camel = str(metric[0]).upper() + metric[1:]
df_pd_per_country[f"bestFit{metric_camel}"] = result.best_fit
return result, df_pd_per_country
model_results = {}
result_fit_df = {}
last_jump_dict = {}
for country in top_countries:
print(f"Country={country}")
result, df_pd_per_country = get_exp_fit_for_country(df_pd_country, country)
dely = result.eval_uncertainty(sigma=3)
df_pd_per_country["bestFitNumCasesLb"] = result.best_fit - dely
df_pd_per_country["bestFitNumCasesUb"] = result.best_fit + dely
df_pd_per_country["normResidual"] = (
df_pd_per_country["numCases"] - df_pd_per_country["bestFitNumCases"]
) / df_pd_per_country["numCases"]
model_results[country] = result
result_fit_df[country] = df_pd_per_country
last_jump_dict[country] = (
df_pd_per_country["normResidual"].tolist()[-1]
if df_pd_per_country.shape[0] > 10
else None
)
# print(result.fit_report())
df_pd_fig = df_pd_per_country[
[
"daysSince",
"numCases",
"bestFitNumCasesLb",
"bestFitNumCases",
"bestFitNumCasesUb",
]
]
df_pd_fig.columns = [
"daysSince",
"actualNumCases",
"bestFitNumCasesLb",
"bestFitNumCases",
"bestFitNumCasesUb",
]
df_pd_fig = pd.melt(
df_pd_fig,
id_vars=["daysSince"],
value_vars=[
"actualNumCases",
"bestFitNumCasesLb",
"bestFitNumCases",
"bestFitNumCasesUb",
],
)
df_pd_fig.columns = ["daysSince", "type", "numCases"]
fig = px.line(
df_pd_fig,
x="daysSince",
y="numCases",
color="type",
title=f"Exp-fit: {country}",
)
fig.show()
df_country_fit_results = pd.DataFrame(
[
[
country,
model_results[country].result.bic,
model_results[country].result.aic,
model_results[country].result.chisqr,
model_results[country].result.params["a"].value,
model_results[country].result.params["b"].value,
model_results[country].result.params["t0"].value,
model_results[country].ndata,
last_jump_dict[country],
]
for country in model_results.keys()
],
columns=[
"country",
"bic",
"aic",
"chisq",
"paramA",
"paramExponent",
"paramT0",
"numPoints",
"normResidualLast",
],
).sort_values("bic")
df_pd_fig = pd.melt(
df_country_fit_results, id_vars=["country"], value_vars=["aic", "bic"],
)
df_pd_fig.columns = ["country", "type", "metric"]
fig = px.scatter(
df_country_fit_results.sort_values("paramExponent", ascending=False),
x="paramExponent",
y="bic",
color="country",
size="numPoints",
color_discrete_sequence=px.colors.qualitative.Dark24,
title="Exponents and BIC for countries <br>Size=num valid data points",
opacity=0.5,
text="country",
)
fig.update_traces(textposition="top center", textfont=dict(size=8, color="gray"))
fig.update_layout(
xaxis=dict(title="Exponent <br> (greater exponent means sharper rise)"),
yaxis=dict(title="BIC <br> (less BIC means more confidence in the model)"),
)
fig.show()
px.bar(
df_pd_fig,
x="country",
y="metric",
color="type",
barmode="group",
title="Model accuracy metrics",
)
fig = px.bar(
df_country_fit_results.sort_values("normResidualLast"),
x="country",
y="normResidualLast",
color="numPoints",
color_continuous_scale=["lightgray", "black"],
title="Is the previous day's jump signifying larger or smaller than expected jump? <br> Color indicates number of points used to model. Greater number of points is preferred",
)
fig.update_layout(
yaxis=dict(
title="Normalized residual on the last day <br> = (Actual cases - Predicted cases)/Actual Cases <br> (greater value means sharper rise than expected)"
),
)
fig.show()